/*
   This file was derived from the libwww code, version 2.15, from CERN.
   A number of modifications have been made by Spyglass.

   eric@spyglass.com
 */
/*      Parse HyperText Document Address        HTParse.c
   **       ================================
 */

#include "all.h"

#define HEX_ESCAPE '%'

struct struct_parts
{
	char *access;
	char *host;
	char *absolute;
	char *relative;
/*  char * search;      no - treated as part of path */
	char *anchor;
};


/*  Strip white space off a string
   **   ------------------------------
   **
   ** On exit,
   **   Return value points to first non-white character, or to 0 if none.
   **   All trailing white space is OVERWRITTEN with zero.
   ** CMF 3-29-95: strip everything after a cr or lf
   ** CMF 6-08-95: rather than strip everthing after a cr or lf, just delete them
   ** 			   this is for compatibility w/ netscape and some URLs
 */

PUBLIC char *HTStrip(char *s)
{
#define ISSPACE(c) ((c==' ')||(c=='\t')||(c=='\n')||(c=='\r'))
	char *p = s;
	char *q;

	if (!s)
		return NULL;			/* Doesn't dump core if NULL */
	while (ISSPACE(*s))
		s++;					/* Strip leading blanks */
	for (p = s, q = s; *q; q++) /* Find end of string & strip embedded CR/LF */
		 if (*q != CR && *q != LF)
		 	*p++ = *q;
	*p = '\0';
	for (p--; p >= s; p--)
	{
		if (ISSPACE(*p))
			*p = 0;				/* Zap trailing blanks */
		else
			break;
	}
	return s;
}

INLINE BOOL bIsAbsolute(const char *p)
{
	return (*p == '/' || *p == '\\' || (*p && (p[1] == ':' || p[1] == '|')));
}

INLINE BOOL bIsUNC(const char *p)
{
	return (!strncmp(p, "\\\\", 2)) || (!strncmp(p, "//", 2));
}

INLINE BOOL bIsDrive(const char *p)
{
	return (p[1] == ':' || p[1] == '|');
}

/*  Scan a filename for its consituents
   **   -----------------------------------
   **
   ** On entry,
   **   name    points to a document name which may be incomplete.
   ** On exit,
   **      absolute or relative may be nonzero (but not both).
   **   host, anchor and access may be nonzero if they were specified.
   **   Any which are nonzero point to zero terminated strings.
 */
PRIVATE void scan(char *name, struct struct_parts *parts)
{
	char *after_access;
	char *p;
	int length = strlen(name);

	parts->access = 0;
	parts->host = 0;
	parts->absolute = 0;
	parts->relative = 0;
	parts->anchor = 0;

	after_access = name;

	for (p = name; *p; p++)
	{
		if (*p == ':')
		{
			*p = 0;
			GTR_MakeStringLowerCase(after_access);
			if (!strcmp(after_access,"url"))
			{
				after_access = p + 1;
				continue;
			}
			parts->access = after_access;	/* Access name has been specified */
			after_access = p + 1;
			break;
		}
		if (*p == '/')
			break;
		if (*p == '#')
			break;
	}

	for (p = name + length - 1; p >= name; p--)
	{
		if (*p == '#')
		{
			parts->anchor = p + 1;
			*p = 0;				/* terminate the rest */
		}
	}
	p = after_access;
	if (parts->access && !strcmp(parts->access, "file"))
	{
	//	sometimes file: URLS look like 		file:///c|/blah.htm
	//							  or		file:///blah.htm
	//							  or		file:c:/blah.htm
	//							  or		file:/blah.htm
		if (!strncmp(p, "///", 3))
		{
			p += 3;
		}
		if (bIsAbsolute(p))
		{
			parts->absolute = ((*p != '\\' && *p != '/') || (!strncmp(p, "\\\\", 2)) ||(!strncmp(p, "//", 2))) ? p : p + 1;
		}
		else
		{
			parts->relative = (*after_access) ? after_access : 0;	/* zero for "" */
		}
	}
	else
	{
		if (*p == '/')
		{
			if (p[1] == '/')
			{
				char *phost;

				parts->host = p + 2;	/* host has been specified  */
				*p = 0;				/* Terminate access         */
				p = strchr(parts->host, '/');	/* look for end of host name if any */
				if (p)
				{
					*p = 0;			/* Terminate host */
					parts->absolute = p + 1;	/* Root has been found */
				}
				phost = NULL;
				if (parts->access && ((!strcmp(parts->access,"ftp")) ||	(!strcmp(parts->access,"telnet"))))
				{
					phost = strrchr(parts->host, '@'); 	/* login for telnet,ftp */
					if (!phost) phost = parts->host;
				}
				else if (parts->access && ((!strcmp(parts->access,"gopher")) || (!strcmp(parts->access,"http")) || (!strcmp(parts->access,"wais"))))
					phost = parts->host;
				if (phost) GTR_MakeStringLowerCase(phost);
			}
			else
			{
				parts->absolute = p + 1;	/* Root found but no host */
			}
		}
		else if (*p == '\\')
		{
			parts->absolute = p[1] == '\\' ? p : p + 1;
		}
		else
		{
			parts->relative = (*after_access) ? after_access : 0;	/* zero for "" */
		}
	}

#ifdef OLD_CODE
	/* Access specified but no host: the anchor was not really one
	   e.g. news:j462#36487@foo.bar -- JFG 10/jul/92, from bug report */
	/* This kludge doesn't work for example when coming across
	   file:/usr/local/www/fred#123
	   which loses its anchor. Correct approach in news is to
	   escape weird characters not allowed in URL.  TBL 21/dec/93
	 */
	if (parts->access && !parts->host && parts->anchor)
	{
		*(parts->anchor - 1) = '#';		/* Restore the '#' in the address */
		parts->anchor = 0;
	}
#endif

#ifdef NOT_DEFINED				/* search is just treated as part of path */
	{
		char *p = relative ? relative : absolute;
		if (p)
		{
			char *q = strchr(p, '?');	/* Any search string? */
			if (q)
			{
				*q = 0;			/* If so, chop that off. */
				parts->search = q + 1;
			}
		}
	}
#endif
}								/*scan */

/*  Parse a Name relative to another name
   **   -------------------------------------
   **
   **   This returns those parts of a name which are given (and requested)
   **   substituting bits from the related name where necessary.
   **
   ** On entry,
   **   aName       A filename given
   **      relatedName     A name relative to which aName is to be parsed
   **      wanted          A mask for the bits which are wanted.
   **
   ** On exit,
   **   returns     A pointer to a malloc'd string which MUST BE FREED
 */
char *HTParse(const char *aName, const char *relatedName, int wanted)
{
	char *return_value = 0;
	char *p;
	char *access;
	struct struct_parts given, related;
	char name[MAX_URL_STRING+1];
	char rel[MAX_URL_STRING+1];
	char result[2*MAX_URL_STRING+1];	/* Make this longer to avoid overflow */
	BOOL bIsFile;

	/* Make working copies of input strings to cut up:
	 */
 	GTR_strncpy(name, aName, MAX_URL_STRING);
 	GTR_strncpy(rel, relatedName, MAX_URL_STRING);

	scan(name, &given);
	scan(rel, &related);
	result[0] = 0;				/* Clear string  */
	access = given.access ? given.access : related.access;
	if (wanted & PARSE_ACCESS)
		if (access)
		{
			strcat(result, access);
			if (wanted & PARSE_PUNCTUATION)
				strcat(result, ":");
		}

	if (given.access && related.access)		/* If different, inherit nothing. */
		if (strcmp(given.access, related.access) != 0)
		{
			related.host = 0;
			related.absolute = 0;
			related.relative = 0;
			related.anchor = 0;
		}
	bIsFile = (access && (!_stricmp(access, "file")));
		
	if (wanted & PARSE_HOST)
		if (given.host || related.host)
		{
			char *tail = result + strlen(result);
			if (wanted & PARSE_PUNCTUATION)
				strcat(result, "//");
			if (given.host)
			{
				strcat(result, given.host);
			}
			else
			{
				strcat(result, related.host);
			}

			/* Ignore default port numbers, and trailing dots on FQDNs
			   which will only cause identical adreesses to look different */
			{
				char *p;
				p = strchr(tail, ':');
				if (p && access)
				{				/* Port specified */
 					if (   (   strcmp(access, "http") == 0
 							&& strcmp(p, ":80") == 0)
 						|| (   strcmp(access, "gopher") == 0
 							&& strcmp(p, ":70") == 0)
#ifdef HTTPS_ACCESS_TYPE
 						|| (   strcmp(access, "https") == 0
 							&& strcmp(p, ":443") == 0)
#endif
#ifdef SHTTP_ACCESS_TYPE
 						|| (   strcmp(access, "shttp") == 0
 							&& strcmp(p, ":80") == 0)
#endif
					   )
						*p = (char) 0;	/* It is the default: ignore it */
				}
				if (!p)
					p = tail + strlen(tail);	/* After hostname */
				if (strlen (p))	/* -dpg */
				{
					p--;			/* End of hostname */
					if (*p == '.')
						*p = (char) 0;	/* chop final . */
				}
			}
		}

	if (given.host && related.host)		/* If different hosts, inherit no path. */
		if (strcmp(given.host, related.host) != 0)
		{
			related.absolute = 0;
			related.relative = 0;
			related.anchor = 0;
		}

	if (wanted & PARSE_PATH)
	{
		BOOL bOKToInheritPath = (!given.host);
		BOOL bRIsUNC = bIsFile && related.absolute && bIsUNC(related.absolute);
		BOOL bRIsDrive = bIsFile && related.absolute && bIsDrive(related.absolute);

		if (given.absolute)
		{
			BOOL bBack = FALSE;
									
		// 	All is given, except for perhaps the drive or unc root 
			if (bIsFile)
			{
				BOOL bGIsUNC = given.absolute && bIsUNC(given.absolute);
				BOOL bGIsDrive = given.absolute && bIsDrive(given.absolute);

				bBack = bRIsUNC || bRIsDrive || bGIsUNC || bGIsDrive || strchr(given.absolute, '\\'); 
				if ((!(bGIsUNC || bGIsDrive)) && (bRIsUNC || bRIsDrive))
				{
					const char *e = NULL;

					if (bRIsDrive)
					{
						e = related.absolute + 2;
					}
					else
					{
						if (e = strchr(related.absolute + 2, '\\'))
							e = strchr(e + 1, '\\');
					}
					if (e)
						GTR_strncat(result, related.absolute, e - related.absolute);
				}
			}
			if (((!bIsFile) || !bIsAbsolute(given.absolute)) && (wanted & PARSE_PUNCTUATION))
				strcat(result, bBack ? "\\" : "/");
			strcat(result, given.absolute);
		}
		else if (related.absolute && bOKToInheritPath)
		{						/* Adopt path not name */
			char slash1 = '/';
			char slash2 = slash1;
			BOOL bBack = FALSE;

			if (bIsFile)
			{
				slash2 = '\\';
				bBack = bRIsUNC || bRIsDrive || strchr(related.absolute, '\\'); 
			}
			if ((!bIsFile) || !bIsAbsolute(related.absolute))
				strcat(result, bBack ? "\\" : "/");
			strcat(result, related.absolute);

			if (given.relative)
			{
				p = strchr(result, '?');	/* Search part? */
				if (!p)
					p = result + strlen(result) - 1;
				for (; *p != slash1 && *p != slash2; p--) ;	/* last / */
				p[1] = 0;		/* Remove filename */
				strcat(result, given.relative);		/* Add given one */
				HTSimplify(result);
			}
		}
		else if (given.relative)
		{
			/* The following 3 lines were copied from NCSA Mosaic for Windows */
			if ((wanted & PARSE_HOST) && (given.host || related.host) && (wanted & PARSE_PUNCTUATION))
				if (result[strlen(result) - 1] != '/')
					strcat(result, "/");
			strcat(result, given.relative);		/* what we've got */
		}
		else if (related.relative && bOKToInheritPath)
		{
			strcat(result, related.relative);
		}
		else
		{						/* No inheritance */
			strcat(result, "/");
		}
	}

	if (wanted & PARSE_ANCHOR)
		if (given.anchor || related.anchor)
		{
			if (wanted & PARSE_PUNCTUATION)
				strcat(result, "#");
			strcat(result, given.anchor ? given.anchor : related.anchor);
		}

	/* We truncate URLs to 1024 bytes if they're too long. */
	result[MAX_URL_STRING] = '\0';
	return_value = GTR_strdup(result);

	return return_value;		/* exactly the right length */
}


/*
   **   As strcpy() but guaranteed to work correctly
   **   with overlapping parameters.    AL 7 Feb 1994
 */
PRIVATE void ari_strcpy(char *to, char *from)
{
	char *tmp;

	if (!to || !from)
		return;

	tmp = (char *) GTR_MALLOC(strlen(from) + 1);
	if (tmp)
	{
		strcpy(tmp, from);
		strcpy(to, tmp);
		GTR_FREE(tmp);
	}
	else
	{
		/* TODO */
	}
}
/*          Simplify a filename
   //       -------------------
   //
   // A unix-style file is allowed to contain the seqeunce xxx/../ which may be
   // replaced by "" , and the seqeunce "/./" which may be replaced by "/".
   // Simplification helps us recognize duplicate filenames.
   //
   //   Thus,   /etc/junk/../fred   becomes /etc/fred
   //       /etc/junk/./fred    becomes /etc/junk/fred
   //
   //      but we should NOT change
   //       http://fred.xxx.edu/../..
   //
   //   or  ../../albert.html
   //
   //	CMF 5/26/95.  Note that many servers now bounce requests like 
   //	http://fred.xxx.edu/../fred.gif with a 403 for security. 
 */
PUBLIC void HTSimplify(char *filename)
{
	char *p = filename;
	char *q;

	if (p)
	{
		while (*p && (*p == '/' || *p == '.'))	/* Pass starting / or .'s */
			p++;
		while (*p)
		{
			if (*p == '/')
			{
				if ((p[1] == '.') && (p[2] == '.') && (p[3] == '/' || !p[3]))
				{
					for (q = p - 1; (q >= filename) && (*q != '/'); q--) ;	/* prev slash */
					if (q[0] == '/' && 0 != strncmp(q, "/../", 4))
					{
						if (!(q - 1 > filename && q[-1] == '/'))
						{
							ari_strcpy(q, p + 3);	/* Remove  /xxx/..  */
							if (!*filename)
								strcpy(filename, "/");
							p = q - 1;	/* Start again with prev slash  */
						}
						else
						{
							ari_strcpy(p, p + 3);	/* Remove starting ../ */
							p = p - 1; /* Start over with rest of path */
						}
					}
					else
					{			/*   xxx/.. leave it!   */
#ifdef BUG_CODE
						ari_strcpy(filename, p[3] ? p + 4 : p + 3);		/* rm  xxx/../ */
						p = filename;	/* Start again */
#endif
					}
				}
				else if ((p[1] == '.') && (p[2] == '/' || !p[2]))
				{
					ari_strcpy(p, p + 2);	/* Remove a slash and a dot */
				}
#if 0
				else if (p[-1] != ':')
				{
					while (p[1] == '/')
					{
						ari_strcpy(p, p + 1);	/* Remove multiple slashes */
					}
				}
#endif
			}
			p++;
		}						/* end while (*p) */
	}							/* end if (p) */
}


/*      Escape undesirable characters using %       HTEscape()
   **       -------------------------------------
   **
   **   This function takes a pointer to a string in which
   **   some characters may be unacceptable unescaped.
   **   It returns a string which has these characters
   **   represented by a '%' character followed by two hex digits.
   **
   **   Unlike HTUnEscape(), this routine returns a malloced string.
 */

PRIVATE CONST unsigned char isAcceptable[96] =

/*   Bit 0       xalpha      -- see HTFile.h
**   Bit 1       xpalpha     -- as xalpha but with plus.
**   Bit 2 ...   path        -- as xpalphas but with /
*/
/*   0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
	{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 0, 7, 7, 4,	/* 2x   !"#$%&'()*+,-./  */
	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0,	/* 3x  0123456789:;<=>?  */
	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,	/* 4x  @ABCDEFGHIJKLMNO  */
	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 7,	/* 5X  PQRSTUVWXYZ[\]^_  */
	 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,	/* 6x  `abcdefghijklmno  */
	 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0};	/* 7X  pqrstuvwxyz{\}~  DEL */

PRIVATE char *hex = "0123456789ABCDEF";

PUBLIC char *HTEscape(CONST char *str, unsigned char mask, char protect)
{
#define ACCEPTABLE(a)	((a>=32 && a<128 && ((isAcceptable[a-32]) & mask)) || a == protect)
	CONST char *p;
	char *q;
	char *result;
	int unacceptable = 0;
	
	for (p = str; *p; p++)
		if (!ACCEPTABLE((unsigned char) (*p)))
			unacceptable++;
	result = (char *) GTR_MALLOC(p - str + unacceptable + unacceptable + 1);
	if (result)
	{
		for (q = result, p = str; *p; p++)
		{
			unsigned char a = *p;
			if (!ACCEPTABLE(a))
			{
				*q++ = HEX_ESCAPE;	/* Means hex commming */
				*q++ = hex[a >> 4];
				*q++ = hex[a & 15];
			}
			else
				*q++ = *p;
		}
		*q++ = 0;					/* Terminate */
	}
	return result;
}


/*      Decode %xx escaped characters           HTUnEscape()
   **       -----------------------------
   **
   **   This function takes a pointer to a string in which some
   **   characters may have been encoded in %xy form, where xy is
   **   the acsii hex code for character 16x+y.
   **   The string is converted in place, as it will never grow.
 */

PRIVATE char from_hex(char c)
{
	return c >= '0' && c <= '9' ? c - '0'
		: c >= 'A' && c <= 'F' ? c - 'A' + 10
		: c - 'a' + 10;			/* accept small letters just in case */
}

PUBLIC char *HTUnEscape(char *str)
{
	char *p = str;
	char *q = str;
	while (*p)
	{
		if (*p == HEX_ESCAPE)
		{
			p++;
			if (*p)
				*q = from_hex(*p++) * 16;
			if (*p)
				*q = *q + from_hex(*p++);
			q++;
		}
		else
		{
			*q++ = *p++;
		}
	}

	*q++ = 0;
	return str;

}								/* HTUnEscape */
